input data: - /home/rstudio/work/data/ClinicalTable Cell Reports.txt
workflow1: Q1. Is change of body feature (reduce amount of BMI and/or Waist/Hip value …) is correspond to change of the expression profile cluster?
Problem: Is it OK to just OB - POB count? Need normalization? -> It would be better not to take difference of insuline responde gene profile, but extract insuline response gene in sub group.
Workflow2: 1. Modify table to have 2 column OB vs POB.
2. Check the distribution of OB - POB value of clinic values to get sub
groups of the indivisuals. -> better cured vs less cured 3. extract
insulin response gene in those sub groups. 4. Check the different
insulin response gene between OB sub and POB sub group -> extention
of Fig2a.
Q2. Dose 7 enriched TF has same activity ratio though the target? 1. create small directed network of 7 TFs to insuline responde gene. 2. culculate ratio of the target activity (induce, attenuate) 3. Find TF which is commonly enriched but differentry active the targets.
-> This analysis may not be find difference because correlation of the insulin responce gene profile between groups in FigureS2 shows correlation.
library(tidyverse)
load("~/work/data/ExpressionTables.RData")
clinic <- read_tsv("~/work/data/ClinicalTable Cell Reports.txt")
Rows: 151 Columns: 17── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): Patnr, Time, Type, Gender
dbl (13): Age, BMI, Waist, Hip, Syst_BT, Diast_BT, P_glukos, Kol, TG, Ins, HDL, M_v_rde_60_120, cellvol
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Add W/H value which is also one of the score to varidate obisity.
clinic$wh <- clinic$Waist / clinic$Hip
get person ID to get match of person.
clinic$ID <- substr(clinic$Patnr, start = 3, stop = nchar(clinic$Patnr))
clinic$Location <- substr(clinic$Patnr, start = 1, stop = 2)
get the sample ID in expression count matrix.
CAGE_ID <- grep("f0", colnames(phase2.female.counts), value=TRUE)
CAGE_ID <- unlist(strsplit(CAGE_ID, "f0"))
CAGE_ID
[1] "NG1" "NG2" "NG3" "NG4" "NK13" "NK14" "NK15" "NK22" "NK25" "NK26" "NK27" "NK29"
[13] "NK2" "NK35" "NK37" "NK38" "NK39" "NK3" "NK40" "NK44" "NK7" "NK8" "NK9" "NO101"
[25] "NO108" "NO117" "NO121" "NO125" "NO131" "NO135" "NO146" "NO150" "NO22" "NO27" "NO32" "NO37"
[37] "NO40" "NO47" "NO48" "NO50" "NO51" "NO56" "NO68" "NO73" "NO77" "NO89"
filter Male data, NO group, row with NA data.
filterd <- clinic %>%
filter(Patnr %in% CAGE_ID) %>%
filter(!Gender == "M") %>%
filter(Location == "NO") %>%
group_by(Type)
This matrix is wide format. It will be more easy to handle long format. Column will be ID, name of the value, value of OB, value of POB, OB - POB.
OB_long <- filter(filterd, Type=="OBESE") %>%
pivot_longer(cols = c("BMI", "wh", "Syst_BT", "Diast_BT", "P_glukos", "Kol", "TG", "Ins", "HDL", "M_v_rde_60_120", "cellvol"), names_to = "score", values_to = "OB") %>%
dplyr::select(!c(Time, Gender,Waist, Hip,Age, Location))
POB_long <- filter(filterd, Type=="POST_OBESE") %>%
pivot_longer(cols = c("BMI", "wh", "Syst_BT", "Diast_BT", "P_glukos", "Kol", "TG", "Ins", "HDL", "M_v_rde_60_120", "cellvol"), names_to = "score", values_to = "POB") %>%
dplyr::select(!c(Time, Gender,Waist, Hip,Age, Location))
merged_matrix <- merge(OB_long, POB_long, by = c("ID", "score", "Patnr"), all = TRUE)
merged_matrix$OB_POB <- merged_matrix$OB - merged_matrix$POB
merged_matrix$OB_POB_ratio_100 <- merged_matrix$POB / merged_matrix$OB * 100 - 100
merged_matrix$OB_POB_ratio <- log2(merged_matrix$POB / merged_matrix$OB)
head(merged_matrix)
Check parson is N = 23.
length(unique(merged_matrix$ID))
[1] 23
Boxplot of distribution of the score which is difference between OB and POB (OB -POB). Cellvol is high range conpare to other scores.
p <- ggplot(merged_matrix,aes(x=score, y=OB_POB)) +
geom_boxplot() +
geom_point()
# facet_wrap(~score, ncol=4)
print(p)
Separatery plot ranking bar plot to see the distribution.
p <- ggplot(merged_matrix, aes(x = fct_reorder(score, OB_POB_ratio, .fun = median), y=OB_POB_ratio)) +
geom_boxplot() +
geom_point() +
labs(title = "Change of clinical status from OB to POB", x="clinical collected scores", y = "log2(POB/OB)")
print(p)
Score which decrease in POB: - cell volume - Insulin level - TG - BMI - Kol
Score which increase in POB: - HDL - M value
POB decrease the scores those un-healthy if it is high score, and increase the scores those healthy if it is high score.
Plot of “Change of clinical status from OB to POB” shows ratio of the change from OB to POB. This plot may not show the importance for the measurement which even small differences are important such as blood pressure. In order to visualize the actual changes, boxplot that relates the corresponding individual data are shown.
longer <- merged_matrix %>%
pivot_longer(cols = c("OB", "POB"), names_to = "states", values_to = "value")
longer
#FIXME: better to plot using violine plot.
for (i in unique(longer$score)) { # Loop over loop.vector
# store data in column.i as x
sub_df <- longer %>%
dplyr::filter(score == i)
# Plot histogram of x
p <- ggplot(sub_df, aes(x = interaction(states, score), y = value)) +
geom_boxplot(aes(fill = states), alpha = 0.5) +
geom_line(aes(group = interaction(ID, score)),
alpha = 0.5, colour = "darkgrey") +
scale_x_discrete(labels = "") +
labs(title = paste0("Change of clinical status from OB to POB: ", i), x="states", y = i)
show(p)
}
ggplot(longer, aes(x = interaction(states, score), y = value)) +
geom_boxplot(aes(fill = states), alpha = 0.5) +
geom_line(aes(group = interaction(ID, score)),
alpha = 0.5, colour = "darkgrey") +
facet_grid(~score,scales="free_x") +
scale_x_discrete(labels = "")
Next see the correlation between those scores.
pob_ob <- pivot_wider(merged_matrix, id_cols = ID, names_from = score, values_from=OB_POB_ratio) %>%
drop_na()%>%
select_if(is.numeric) %>%
cor()
Diast blood tension and Syst blood tention is highly correlated. BMI and cellvol is correlated, and TG and HDl is negatively correlated. # FIXME: notmalize or standarize.
library(pheatmap)
pheatmap(pob_ob, display_numbers=TRUE)
features <- pivot_wider(merged_matrix, id_cols = ID, names_from = score, values_from=OB_POB_ratio) %>%
select_if(is.numeric) %>%
drop_na()
pca_res <- prcomp(t(features), scale. = TRUE)
pca_res
Standard deviations (1, .., p=11):
[1] 3.901010e+00 1.475656e+00 9.522736e-01 5.847826e-01 4.015955e-01 3.164272e-01 2.136506e-01
[8] 1.701989e-01 1.195847e-01 7.379906e-02 1.837866e-16
Rotation (n x k) = (19 x 11):
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
[1,] -0.2389693 -0.15706700 0.03367265 -0.456370068 0.08715553 0.0432987246 -0.093424849 0.04023031
[2,] -0.2526831 -0.10154594 0.04648646 0.054991975 -0.00300070 0.1465249546 0.023961244 -0.01864974
[3,] -0.2085492 0.28897977 -0.23806437 0.468374508 0.38655026 0.1124163737 -0.289942125 -0.09109109
[4,] -0.2407646 0.21137860 0.11572953 -0.014183847 0.17118279 0.0039758582 0.275523634 0.03750508
[5,] -0.2508307 0.01395563 0.03261808 0.240514803 0.16164249 -0.3854093377 -0.194384262 0.12284822
[6,] -0.2284203 -0.22849317 -0.27973491 -0.071040734 -0.29105045 -0.0008117684 -0.303190414 0.15307950
[7,] -0.2157009 -0.25819230 0.05065685 0.504417756 -0.54746427 0.2696720866 0.108299591 0.19583769
[8,] -0.2457352 0.08771806 0.18631173 0.161884311 -0.21337026 -0.3761714322 0.129873508 -0.03517396
[9,] -0.2349074 0.23602477 -0.16791612 -0.126633629 -0.20040501 0.1030342121 0.020143994 0.01071500
[10,] -0.2254031 0.28634826 0.16132313 -0.226457387 -0.02156809 0.1051722429 0.330484333 0.14967664
[11,] -0.1831216 -0.31516058 0.54379479 -0.094546460 0.01790683 -0.0740666914 -0.052811921 -0.17855184
[12,] -0.2525226 -0.03571501 0.05242185 -0.198682345 0.06661564 0.1056496690 -0.105537329 0.47604956
[13,] -0.2488526 0.07380956 -0.14882375 0.110479305 0.06334065 0.1909859757 0.519920568 -0.29685329
[14,] -0.1493354 -0.45306355 -0.44141169 -0.002864675 0.30280568 -0.3703047293 0.375465579 0.24402211
[15,] -0.2067767 -0.37391986 -0.06248052 -0.044976290 0.27197732 0.4898363202 -0.133342669 -0.32642299
[16,] -0.2371219 -0.08112319 0.35101027 0.130936952 0.14256891 -0.2083489761 -0.048181514 -0.21801628
[17,] -0.2396594 0.07012770 -0.27815604 -0.196951759 -0.28215884 -0.3119598629 -0.209314341 -0.48370286
[18,] -0.2344084 0.23853044 -0.15329916 -0.199103260 -0.14064195 0.0370589531 -0.006196166 -0.06996915
[19,] -0.2381066 0.22986006 0.11128983 0.006507977 0.15102091 0.0852406102 -0.271320914 0.29761913
PC9 PC10 PC11
[1,] -0.39593127 -0.40447908 0.4812431088
[2,] 0.14740218 -0.27502337 -0.4767519006
[3,] -0.10838645 -0.21944941 0.1747766435
[4,] 0.06784204 -0.09299340 -0.1268098289
[5,] -0.07654583 0.09381908 -0.1115586616
[6,] -0.23941379 -0.21973989 -0.3789963384
[7,] 0.09613289 -0.10620697 0.3033058307
[8,] -0.28233197 0.27285888 -0.0192654412
[9,] 0.07120092 0.21884948 0.3170117386
[10,] 0.09645009 -0.18891100 -0.0005117691
[11,] -0.19131590 0.04378103 -0.1195697618
[12,] 0.23404931 0.45604815 0.0293430140
[13,] -0.38607618 0.16536115 -0.0995236966
[14,] 0.13738157 -0.02920401 0.0804678906
[15,] 0.12060623 0.32763441 -0.0215555168
[16,] 0.46347625 -0.21267854 0.2264762873
[17,] 0.14357544 0.16259950 0.0899823001
[18,] 0.31111602 -0.19164600 -0.2280054930
[19,] -0.18051321 0.15399313 -0.0464949834
it is hard to say there is a sub-group in 23 (19 after drop na) individual. #FIXME: Check contribution
library(ggfortify)
autoplot(pca_res, data = t(features))
get_sorted <- function(df, feature){
base <- df %>% filter(score == feature) %>%
drop_na() %>%
arrange(OB_POB_ratio)
return(base)
}
n = 7
CV <- get_sorted(merged_matrix, "cellvol")
CV_head <- head(CV, n)
CV_tail <- tail(CV, n)
Ins <- get_sorted(merged_matrix, "Ins")
Ins_head <- head(Ins, n)
Ins_tail <- tail(Ins, n)
MV <- get_sorted(merged_matrix, "M_v_rde_60_120")
MV_head <- head(MV, n)
MV_tail <- tail(MV, n)
Change of those values are different in individual. This infomation is also able to assumed by correlation result. Those scores has less correlated means, change of scores are different amoung individuals.
library(ggVennDiagram)
x <- list(CV_cure=CV_head$Patnr, Ins_cure=Ins_head$Patnr, MV_cure=MV_tail$Patnr)
ggVennDiagram(x, show_intersect=TRUE)
Warning: Ignoring unknown aesthetics: text
x <- list(CV_less=CV_tail$Patnr, Ins_less=Ins_tail$Patnr, MV_less=MV_head$Patnr)
ggVennDiagram(x, show_intersect=TRUE)
Warning: Ignoring unknown aesthetics: text
Insuline respond subgroup based on Insuline level.
cols_to_select <- Ins_head$Patnr
selected_cols <- phase2.female.counts %>%
dplyr::select(contains(cols_to_select))
selected_cols
load("/home/rstudio/work/data/ExpressionTables.RData")
load("/home/rstudio/work/data/Annotation.RData")
load("/home/rstudio/work/data/Cohort.RData")
cohort_2 <- cohort %>%
dplyr::filter(Patnr %in% cols_to_select)
library(edgeR)
##################################
# DE analysis of paired samples
###################################
cohort <- cohort_2
IDs<-paste(cohort$OB_NO,as.factor(cohort$IDs),sep=".")
Condition<-cohort$newcond2
# deviding expression matrix to subgroup
df.pairs <- selected_cols
# DE Analysis
y <- DGEList(counts=df.pairs,group = Condition)
y
design <- model.matrix(~0+Condition+IDs,data=y$samples)
design <- design[,!grepl("f0", colnames(design))]
design <- design[, colSums(design != 0) > 0]
colnames(design)<-make.names(colnames(design))
y <- calcNormFactors(y,norm.method="RLE")
y <- estimateDisp(y,design)
fit <- glmFit(y,design,robust=T)
my.contrasts<-makeContrasts(
HivsF.OB=ConditionOBESE.h0,
HivsF.POB=ConditionOBESE.h2-ConditionOBESE.f2,
levels=design)
contrasts<-colnames(my.contrasts)
myfun <- function(filex) {
print(filex)
lrt <- glmLRT(fit, contrast=my.contrasts[,filex])
out.top=topTags(lrt,n=Inf,adjust.method='BH')
out.adj=out.top$table
out.adj.1<-out.adj
if(nrow(out.adj.1)==0){
return()
}else{
out.adj.1$Contrast<-filex
out.adj.1$TC<-rownames(out.adj.1)
out.adj.1$updown<-ifelse(out.adj.1$logFC>0,"up","down")
return(out.adj.1)
}
}
y.list<-(lapply(contrasts, myfun))
tab<-do.call(rbind, y.list)
tab$Sign<-ifelse(tab$FDR<0.05,"sign","no")
res.count <- table(tab[tab$Sign=="sign",]$Contrast,tab[tab$Sign=="sign",]$updown )
res.count
edgeR result before has 441 down regulated gene. This is not seems right. It is because design matrix is not propary setted.
What I want to see is different insuline response gene between + group (well cured) and - group (mild cured). It means, POB is not one but separated to 2.
NO will be also matched with Age, M-value and BMI mean of POB+ and POB-.
Age_df <- data.frame(POBp = POBp$Age, POBm = POBm$Age)
ggplot(data = Age_df) +
geom_point()
Error in `geom_point()`:
! Problem while setting up geom.
ℹ Error occurred in the 1st layer.
Caused by error in `compute_geom_1()`:
! `geom_point()` requires the following missing aesthetics: x and y
Backtrace:
1. base (local) `<fn>`(x)
2. ggplot2:::print.ggplot(x)
4. ggplot2:::ggplot_build.ggplot(x)
5. ggplot2:::by_layer(...)
12. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
13. l$compute_geom_1(d)
14. ggplot2 (local) compute_geom_1(..., self = self)
First get matched NO groups.
# get NO with matched distribution with POB+ and POB-
set.seed(123)
load("/home/rstudio/work/data/Cohort.RData")
POBp <- clinic %>%
dplyr::filter(Patnr %in% Ins_head$Patnr)
POBm <- clinic %>%
dplyr::filter(Patnr %in% Ins_tail$Patnr)
NOs <- clinic %>% dplyr::filter(Type =="NO")
select_NO <- NOs %>%
dplyr::filter(max(POBp$Age) > Age, Age> min(POBp$Age)) %>%
dplyr::filter(max(POBp$BMI) > BMI, BMI> min(POBp$BMI)) %>%
dplyr::filter(max(POBp$M_v_rde_60_120) > M_v_rde_60_120, M_v_rde_60_120> min(POBp$M_v_rde_60_120)) %>%
dplyr::filter(Patnr %in% cohort$Patnr) %>%
sample_n(7)
Also select 6 of OB (each 3 from POB+, POB- matched OB).
OBp_ID <- sample(Ins_head$Patnr, 4)
OBm_ID <- sample(Ins_tail$Patnr, 3)
OB_ID <- c(OBp_ID, OBm_ID)
# Data to run edgeR
print(OB_ID) # OB group
[1] "NO51" "NO22" "NO27" "NO77" "NO101" "NO48" "NO146"
print(select_NO$Patnr) # NO group
[1] "NG3" "NK8" "NK35" "NG2" "NK26" "NK7" "NK25"
print(Ins_head$Patnr) # POB+ group
[1] "NO27" "NO73" "NO51" "NO77" "NO117" "NO135" "NO22"
print(Ins_tail$Patnr) # POB- group
[1] "NO101" "NO89" "NO32" "NO131" "NO146" "NO50" "NO48"
# column names in the count matrix
OB_col <- cohort %>%
dplyr::filter(OB_NO_POB == "OB") %>%
dplyr::filter(Patnr %in% OB_ID) %>%
select(Cond)
NO_col <- cohort %>%
dplyr::filter(OB_NO_POB == "NO") %>%
dplyr::filter(Patnr %in% select_NO$Patnr) %>%
select(Cond)
POBp_col <- cohort %>%
dplyr::filter(OB_NO_POB == "POB") %>%
dplyr::filter(Patnr %in% Ins_head$Patnr) %>%
select(Cond)
POBm_col <- cohort %>%
dplyr::filter(OB_NO_POB == "POB") %>%
dplyr::filter(Patnr %in% Ins_tail$Patnr) %>%
select(Cond)
select_sample <- c(OB_col$Cond, NO_col$Cond, POBp_col$Cond, POBm_col$Cond)
# select samples to use
df.pairs <- phase2.female.counts %>%
dplyr::select(contains(select_sample))
load("/home/rstudio/work/data/Cohort.RData")
# subset cohort file
cohort_2 <- cohort %>%
dplyr::filter(Cond %in% select_sample)
cohort <- cohort_2
cohort$condition <- ifelse(cohort$Cond %in% OB_col$Cond, "OB",
ifelse(cohort$Cond %in% NO_col$Cond, "NO",
ifelse(cohort$Cond %in% POBp_col$Cond, "POBp",
ifelse(cohort$Cond %in% POBm_col$Cond, "POBm","NA"))))
cohort <- cohort %>%
mutate(time = str_sub(Cond,-2,-1))
cohort <- cohort %>%
mutate(condition2 = paste(cohort$condition, cohort$time,sep="."))
IDs<-paste(cohort$OB_NO,as.factor(cohort$IDs),sep=".") # put same ID for same indivisual
Condition<-cohort$condition2
# DE Analysis
y <- DGEList(counts=df.pairs,group = Condition)
# design: Condition (OB,NO,POBp,POBm) and indivisual
design<-model.matrix(~0+Condition+IDs,data=y$samples)
design <- design[,!grepl("f0", colnames(design))]
#design <- design[,!grepl("NO.f", colnames(design))]
#design<-design[, colSums(design != 0) > 0]
colnames(design)<-make.names(colnames(design))
y <- calcNormFactors(y,norm.method="RLE")
y <- estimateDisp(y,design)
fit <- glmFit(y,design,robust=T)
my.contrasts<-makeContrasts(
HivsF.NO=ConditionNO.h0,
HivsF.OB=ConditionOB.h0,
HivsF.POBp=ConditionPOBp.h2-ConditionPOBp.f2,
HivsF.POBm=ConditionPOBm.h2-ConditionPOBm.f2,
levels=design)
contrasts<-colnames(my.contrasts)
myfun <- function(filex) {
print(filex)
lrt <- glmLRT(fit, contrast=my.contrasts[,filex])
out.top=topTags(lrt,n=Inf,adjust.method='BH')
out.adj=out.top$table
out.adj.1<-out.adj
if(nrow(out.adj.1)==0){
return()
}else{
out.adj.1$Contrast<-filex
out.adj.1$TC<-rownames(out.adj.1)
out.adj.1$updown<-ifelse(out.adj.1$logFC>0,"up","down")
return(out.adj.1)
}
}
y.list<-(lapply(contrasts, myfun))
[1] "HivsF.NO"
[1] "HivsF.OB"
[1] "HivsF.POBp"
[1] "HivsF.POBm"
tab<-do.call(rbind, y.list)
tab$Sign<-ifelse(tab$FDR<0.05,"sign","no")
res.count <- table(tab[tab$Sign=="sign",]$Contrast,tab[tab$Sign=="sign",]$updown )
res.count
down up
HivsF.NO 472 0
HivsF.OB 3 0
HivsF.POBm 0 1
save(tab,file="/home/rstudio/work/data/OB7_NO7_POBp7_POBm7.RData")
Remove NO
select_sample <- c(OB_col$Cond, POBp_col$Cond, POBm_col$Cond)
# select samples to use
df.pairs <- phase2.female.counts %>%
dplyr::select(contains(select_sample))
load("/home/rstudio/work/data/Cohort.RData")
# subset cohort file
cohort_2 <- cohort %>%
dplyr::filter(Cond %in% select_sample)
cohort <- cohort_2
cohort$condition <- ifelse(cohort$Cond %in% OB_col$Cond, "OB",
ifelse(cohort$Cond %in% POBp_col$Cond, "POBp",
ifelse(cohort$Cond %in% POBm_col$Cond, "POBm","NA")))
cohort <- cohort %>%
mutate(time = str_sub(Cond,-2,-1))
cohort <- cohort %>%
mutate(condition2 = paste(cohort$condition, cohort$time,sep="."))
IDs<-paste(cohort$OB_NO,as.factor(cohort$IDs),sep=".") # put same ID for same indivisual
Condition<-cohort$condition2
y <- DGEList(counts=df.pairs,group = Condition)
design<-model.matrix(~0+Condition+IDs,data=y$samples)
design <- design[,!grepl("f0", colnames(design))]
design<-design[, colSums(design != 0) > 0]
colnames(design)<-make.names(colnames(design))
y <- calcNormFactors(y,norm.method="RLE")
y <- estimateDisp(y,design)
fit <- glmFit(y,design,robust=T)
my.contrasts<-makeContrasts(
HivsF.OB=ConditionOB.h0,
HivsF.POBp=ConditionPOBp.h2-ConditionPOBp.f2,
HivsF.POBm=ConditionPOBm.h2-ConditionPOBm.f2,
levels=design)
contrasts<-colnames(my.contrasts)
myfun <- function(filex) {
print(filex)
lrt <- glmLRT(fit, contrast=my.contrasts[,filex])
out.top=topTags(lrt,n=Inf,adjust.method='BH')
out.adj=out.top$table
out.adj.1<-out.adj
if(nrow(out.adj.1)==0){
return()
}else{
out.adj.1$Contrast<-filex
out.adj.1$TC<-rownames(out.adj.1)
out.adj.1$updown<-ifelse(out.adj.1$logFC>0,"up","down")
return(out.adj.1)
}
}
y.list<-(lapply(contrasts, myfun))
[1] "HivsF.OB"
[1] "HivsF.POBp"
[1] "HivsF.POBm"
tab<-do.call(rbind, y.list)
tab$Sign<-ifelse(tab$FDR<0.05,"sign","no")
res.count <- table(tab[tab$Sign=="sign",]$Contrast,tab[tab$Sign=="sign",]$updown )
res.count
down up
HivsF.OB 678 51
HivsF.POBm 5 0
HivsF.POBp 3 0
save(tab,file="/home/rstudio/work/data/OB6_POBp6_POBm6.RData")